#ifndef _ZAD2_KERNEL_H_
#define _ZAD2_KERNEL_H_
//#include <stdio.h>

// Number of neighbours taken on each side of an element by the averaging
// window in zad2Kernel; the full window is 2*RADIUS + 1 elements wide.
#define RADIUS 2
// Number of input elements each thread block stages in shared memory (the
// shared buffer holds BLOCK_SIZE + 2*RADIUS floats). zad2Kernel also uses it
// as the block width when locating the right halo and the end of the array,
// so it is presumably meant to equal blockDim.x at launch — TODO confirm
// against the host-side launch code.
#define BLOCK_SIZE 512

/**
 * 1D moving-average filter with a window of 2*RADIUS + 1 elements.
 *
 * Each block copies its own input elements plus RADIUS halo elements on each
 * side into shared memory; after a block-wide barrier every thread averages
 * the window centred on its own element. The first and last RADIUS elements
 * of the whole array have no complete window and are copied through unchanged.
 *
 * Preconditions (not checked in the kernel):
 *  - 1D launch with blockDim.x == BLOCK_SIZE (the shared buffer size and the
 *    halo offsets are computed from BLOCK_SIZE);
 *  - both arrays are treated as holding BLOCK_SIZE * gridDim.x floats, i.e.
 *    one element per launched thread.
 *
 * @param d_input   device-accessible input array (only read)
 * @param d_output  device-accessible output array (one write per thread)
 */
__global__ void zad2Kernel(float* d_input, float* d_output) {
	__shared__ float sBuf[BLOCK_SIZE + 2*RADIUS];

	const unsigned int tid = blockIdx.x*blockDim.x + threadIdx.x;
	const unsigned int localId = threadIdx.x + RADIUS;
	// Element count the kernel assumes for both arrays.
	const unsigned int total = BLOCK_SIZE * gridDim.x;

	sBuf[localId] = d_input[tid];
	if(threadIdx.x < RADIUS) {
		// Left halo. In the first block there is nothing to the left and
		// tid - RADIUS would wrap around (unsigned), so store a placeholder
		// instead of reading out of bounds.
		sBuf[localId - RADIUS] = (tid >= RADIUS) ? d_input[tid - RADIUS] : 0.f;

		// Right halo. In the last block the index falls past the end of the
		// array, so again store a placeholder instead of reading.
		const unsigned int rightId = tid + BLOCK_SIZE;
		sBuf[localId + BLOCK_SIZE] = (rightId < total) ? d_input[rightId] : 0.f;
	}
	// All shared-memory writes must be visible before any window is read.
	__syncthreads();

	// Array edges: no full window available, pass the input value through.
	// These are exactly the threads whose windows would touch the placeholder
	// halo cells, so the placeholders never influence the output.
	if(tid < RADIUS || tid >= total - RADIUS) {
		d_output[tid] = sBuf[localId];
		return;
	}

	float res = 0;

	for(int i = -RADIUS; i <= RADIUS; i++)
		res += sBuf[localId + i];

	d_output[tid] = res / (2 * RADIUS + 1.f);
}

#endif